import pandas as pd
import numpy as np
import seaborn as sns
sns.set_style("whitegrid")
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# Load the California housing training data; the path is relative to the
# current working directory.
df = pd.read_csv('datasets/california_housing_train.csv')
# Preview the first rows to check the columns and value ranges.
df.head()
| | longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value |
|---|---|---|---|---|---|---|---|---|---|
| 0 | -114.31 | 34.19 | 15.0 | 5612.0 | 1283.0 | 1015.0 | 472.0 | 1.4936 | 66900.0 |
| 1 | -114.47 | 34.40 | 19.0 | 7650.0 | 1901.0 | 1129.0 | 463.0 | 1.8200 | 80100.0 |
| 2 | -114.56 | 33.69 | 17.0 | 720.0 | 174.0 | 333.0 | 117.0 | 1.6509 | 85700.0 |
| 3 | -114.57 | 33.64 | 14.0 | 1501.0 | 337.0 | 515.0 | 226.0 | 3.1917 | 73400.0 |
| 4 | -114.57 | 33.57 | 20.0 | 1454.0 | 326.0 | 624.0 | 262.0 | 1.9250 | 65500.0 |
# Draw a joint plot for every ordered pair of distinct columns.
# NOTE(review): each jointplot call opens its own figure and none is closed,
# so the number of open figures grows quadratically with the column count
# (this is what triggers matplotlib's "more than 20 figures" warning).
columns = df.columns.to_list()
for col1 in columns:
    for col2 in columns:
        if col1 == col2:
            continue  # never plot a column against itself
        sns.jointplot(data=df, x=col1, y=col2)
d:\projects\pycharmprojects\ml_2021\venv2\lib\site-packages\seaborn\axisgrid.py:1598: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`). f = plt.figure(figsize=(height, height))
# Violin plot of each column's distribution, in the order listed.
# NOTE(review): no explicit `ax` is passed, so seaborn draws on the current
# Axes; if these run in a single cell/script the plots share one Axes --
# confirm whether one figure per column was intended.
for column in (
    "longitude",
    "latitude",
    "housing_median_age",
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
    "median_income",
    "median_house_value",
):
    ax = sns.violinplot(data=df, x=column)
# Grid of pairwise relationships between the columns of df.
sns.pairplot(df)
<seaborn.axisgrid.PairGrid at 0x1a5909bd5b0>
# Heatmap of the pairwise column correlation matrix returned by df.corr();
# linewidths draws thin separators between the cells.
ax = sns.heatmap(df.corr(), linewidths=.5)
# Features are every column except the target; the target is the median
# house value.
X = df.drop(columns=["median_house_value"])
y = df["median_house_value"]

# Split BEFORE scaling: fitting the scaler on the full data set would let the
# test rows' minima/maxima leak into the preprocessing step.
# X itself is left as the unscaled DataFrame; only the split parts are scaled.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

# Learn the per-feature min/max from the training rows only, then apply the
# same transformation to the held-out rows.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Ordinary least-squares fit on the scaled training features.
regressor = LinearRegression()
regressor.fit(X_train, y_train)
LinearRegression()
# Predict the target for the held-out test features.
y_pred = regressor.predict(X_test)
'Mean Squared Error: 4795883823.951487'
# Report the test-set error and the fitted linear model's parameters,
# one item per line.
print(
    f'Mean Squared Error: {mean_squared_error(y_test, y_pred)}',
    f"Coefficients W = {regressor.coef_}",
    f"Intercept W_0 = {regressor.intercept_}",
    sep="\n",
)
Mean Squared Error: 4795883823.951487 Coefficients W = [ -433775.5900699 -405629.64501937 54401.20493929 -304240.57793445 685837.50476176 -1347415.24445914 319816.61761417 579299.89968707] Intercept W_0 = 373877.65779367456